In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("/storage"))

# Any results you write to the current directory are saved as output.
['retinopathy-train-2015', 'lost+found', 'aptosplus', 'diabetic-retinopathy-resized', 'aptos2019-blindness-detection', 'resnet101.pth']
In [37]:
import os, sys
import re
from multiprocessing import Pool
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import skimage.io
from skimage.transform import resize
from scipy import ndimage
from imgaug import augmenters as iaa
from tqdm import tqdm
import PIL
from PIL import Image, ImageOps
import cv2
from sklearn.utils import class_weight, shuffle
from sklearn.metrics import f1_score, fbeta_score
from sklearn.model_selection import train_test_split
import seaborn as sns
from IPython.display import Image as ShowImage
from collections import Counter
import imageio

WORKERS = 2
CHANNEL = 3

import warnings
warnings.filterwarnings("ignore")
IMG_SIZE = 512
NUM_CLASSES = 5
SEED = 77
TRAIN_NUM = 1000 # use 1000 when you just want to explore new idea, use -1 for full train
In [3]:
df_train = pd.read_csv('/storage/aptos2019-blindness-detection/train.csv')
df_test = pd.read_csv('/storage/aptos2019-blindness-detection/test.csv')

x = df_train['id_code']
y = df_train['diagnosis']

x, y = shuffle(x, y, random_state=SEED)
In [4]:
df_test[:10]
Out[4]:
id_code
0 0005cfc8afb6
1 003f0afdcd15
2 006efc72b638
3 00836aaacf06
4 009245722fa4
5 009c019a7309
6 010d915e229a
7 0111b949947e
8 01499815e469
9 0167076e7089
In [38]:
def resolution_dist(df, img_sub_dir):
    channels_count = Counter()
    height_count = Counter()
    width_count = Counter()
    height_width_count = Counter()

    for i, row in df.iterrows():
            path=f"/storage/aptos2019-blindness-detection/{img_sub_dir}/{row['id_code']}.png"
            height, width, channels = imageio.imread(path).shape
            channels_count.update([channels])
            height_count.update([height])
            width_count.update([width])
            height_width_count.update([f'h{height}_w{width}'])

    print(channels_count)
    print(height_count)
    print(width_count)
    print(height_width_count)
    
    return channels_count, height_count, width_count, height_width_count
In [5]:
df_train.hist()
Out[5]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x7f91a5865cc0>]],
      dtype=object)
In [6]:
train_x, valid_x, train_y, valid_y = train_test_split(x, y, test_size=0.05,
                                                      stratify=y, random_state=SEED)
print(train_x.shape, train_y.shape, valid_x.shape, valid_y.shape)
train_y.hist()
valid_y.hist()
(3478,) (3478,) (184,) (184,)
Out[6]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f91a2c6cba8>

1.1 Simple picture to explain Diabetic Retinopathy

How do we know that a patient have diabetic retinopahy? There are at least 5 things to spot on. Image credit https://www.eyeops.com/ credit : https://www.eyeops.com/

From quick investigations of the data (see various pictures below), I found that Hemorrphages, Hard Exudates and Cotton Wool spots are quite easily observed. However, I still could not find examples of Aneurysm or Abnormal Growth of Blood Vessels from our data yet. Perhaps the latter two cases are important if we want to catch up human benchmnark using our model.

1.2 Original Inputs

First, let have a glance of original inputs. Each row depicts each severity level. We can see two problems which make the severity difficult to spot on. First, some images are very dark [pic(0,2) and pic(4,4) ] and sometimes different color illumination is confusing [pic (3,3)]. Second, we can get the uninformative dark areas for some pictures [pic(0,1), pic(0,3)]. This is important when we reduce the picture size, as informative areas become too small. So it is intuitive to crop the uninformative areas out in the second case.

In [7]:
%%time
fig = plt.figure(figsize=(25, 16))
# display 10 images from each class
for class_id in sorted(train_y.unique()):
    for i, (idx, row) in enumerate(df_train.loc[df_train['diagnosis'] == class_id].sample(5, random_state=SEED).iterrows()):
        ax = fig.add_subplot(5, 5, class_id * 5 + i + 1, xticks=[], yticks=[])
        path=f"/storage/aptos2019-blindness-detection/train_images/{row['id_code']}.png"
        image = cv2.imread(path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image = cv2.resize(image, (IMG_SIZE, IMG_SIZE))

        plt.imshow(image)
        ax.set_title('Label: %d-%d-%s' % (class_id, idx, row['id_code']) )
CPU times: user 3.48 s, sys: 256 ms, total: 3.74 s
Wall time: 3.62 s

We can try gray scale and feel understand better for some pictures, as color distraction is gone. For example, we can see more blood clearer in the upper part of pic(4,4), which has severity of level 4.

In [8]:
%%time
fig = plt.figure(figsize=(25, 16))
for class_id in sorted(train_y.unique()):
    for i, (idx, row) in enumerate(df_train.loc[df_train['diagnosis'] == class_id].sample(5, random_state=SEED).iterrows()):
        ax = fig.add_subplot(5, 5, class_id * 5 + i + 1, xticks=[], yticks=[])
        path=f"/storage/aptos2019-blindness-detection/train_images/{row['id_code']}.png"
        image = cv2.imread(path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
#         image=cv2.addWeighted ( image, 0 , cv2.GaussianBlur( image , (0 ,0 ) , 10) ,-4 ,128)
        image = cv2.resize(image, (IMG_SIZE, IMG_SIZE))

        plt.imshow(image, cmap='gray')
        ax.set_title('Label: %d-%d-%s' % (class_id, idx, row['id_code']) )
CPU times: user 3.29 s, sys: 108 ms, total: 3.4 s
Wall time: 3.28 s

For severity level 4, I feel that two examples here are difficult to spot on, pic(4,1) and pic(4,4). As we try zooming to see the details (use real size image), we can see some abnormalities (cotton wool spots or hard exudates ?) in those eyes clearer (observe the lower-right part of the eye). Therefore, IMG_SIZE is definitely important for this problem. In the next section, we shall see better method than gray-scale conversion.

In [9]:
dpi = 80 #inch

# path=f"../input/aptos2019-blindness-detection/train_images/5c7ab966a3ee.png" # notice upper part
path=f"/storage/aptos2019-blindness-detection/train_images/cd54d022e37d.png" # lower-right, this still looks not so severe, can be class3
image = cv2.imread(path)
image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
height, width = image.shape
print(height, width)

SCALE=2
figsize = (width / float(dpi))/SCALE, (height / float(dpi))/SCALE

fig = plt.figure(figsize=figsize)
plt.imshow(image, cmap='gray')
2136 3216
Out[9]:
<matplotlib.image.AxesImage at 0x7f91a27280b8>

2. Try Ben Graham's preprocessing method.

In the last competition, Ben Graham (last competition's winner) share insightful way to improve lighting condition. Here, we apply his idea, and can see many important details in the eyes much better. For full details, please refer to his technical report in the link above.

In [10]:
%%time
fig = plt.figure(figsize=(25, 16))
for class_id in sorted(train_y.unique()):
    for i, (idx, row) in enumerate(df_train.loc[df_train['diagnosis'] == class_id].sample(5, random_state=SEED).iterrows()):
        ax = fig.add_subplot(5, 5, class_id * 5 + i + 1, xticks=[], yticks=[])
        path=f"/storage/aptos2019-blindness-detection/train_images/{row['id_code']}.png"
        image = cv2.imread(path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
#         image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image = cv2.resize(image, (IMG_SIZE, IMG_SIZE))
        image=cv2.addWeighted ( image,4, cv2.GaussianBlur( image , (0,0) , IMG_SIZE/10) ,-4 ,128) # the trick is to add this line

        plt.imshow(image, cmap='gray')
        ax.set_title('Label: %d-%d-%s' % (class_id, idx, row['id_code']) )
CPU times: user 45.3 s, sys: 176 ms, total: 45.5 s
Wall time: 7.81 s

3. Further improve by auto-cropping

To crop out the uninformative black areas which are evident on pic(0,1), pic(0,3) and pic(4,1), we can try auto cropping. I found 4 alternative codes from https://stackoverflow.com/questions/13538748/crop-black-edges-with-opencv and https://codereview.stackexchange.com/questions/132914/crop-black-border-of-image-using-numpy/132934 ... Fortunately one method works perfectly for a gray scale image, but none works on a color image. In this kernel, I modify the method working on gray-scale a bit to make it suitable for a color image.

In [11]:
def crop_image1(img,tol=7):
    # img is image data
    # tol  is tolerance
        
    mask = img>tol
    return img[np.ix_(mask.any(1),mask.any(0))]

def crop_image_from_gray(img,tol=7):
    if img.ndim ==2:
        mask = img>tol
        return img[np.ix_(mask.any(1),mask.any(0))]
    elif img.ndim==3:
        gray_img = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
        mask = gray_img>tol
        
        check_shape = img[:,:,0][np.ix_(mask.any(1),mask.any(0))].shape[0]
        if (check_shape == 0): # image is too dark so that we crop out everything,
            return img # return original image
        else:
            img1=img[:,:,0][np.ix_(mask.any(1),mask.any(0))]
            img2=img[:,:,1][np.ix_(mask.any(1),mask.any(0))]
            img3=img[:,:,2][np.ix_(mask.any(1),mask.any(0))]
    #         print(img1.shape,img2.shape,img3.shape)
            img = np.stack([img1,img2,img3],axis=-1)
    #         print(img.shape)
        return img

Try Cropping the images

I have tested on around 200 images, and the method works great. However, if anybody find the outlier cases which cause the auto crop to fail, please let me know. I think now the eye pictures are very like the moon by the way :)

IMPORTANT UPDATE on Kernel V.9 I found that there is indeed a case in private test set making the old version of crop function fail. (I spent my 13 submissions until I found this bug) E.g. if there is an adversarial image (super dark) in the private test set, the crop function will crop everything and result in 0 dimension image. I have fixed this bug in this kernel version, but I still could not guarantee whether there are other cases in a private test that will make the crop function fail or not. Update on V11 Now I was able to have a valid LB score with the new crop function, so if anybody still have some submission errors, that is the reason of other bugs.

3.A Important Update on Color Version of Cropping & Ben's Preprocessing

At first, when I wrote this kernel, I could not make a color crop nicely, so I thought that gray scale is a better representation. Now I believe that color version is better, so from this point on I will use color cropping

Below is the cropped of the color version. For color version, note that I use argument sigmaX = 30 of cv2.GaussianBlur, where Ben actually used sigmaX = 10 which may have better performance. I just feel that this sigmaX = 30 or sigmaX = 50 make beautiful [sometimes bloody] yellow moon pictures. Just for the purpose of illustration.

Please refer to https://www.tutorialkart.com/opencv/python/opencv-python-gaussian-image-smoothing/ .

In [12]:
def load_ben_color(path, sigmaX=10):
    image = cv2.imread(path)
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image = crop_image_from_gray(image)
    image = cv2.resize(image, (IMG_SIZE, IMG_SIZE))
    image=cv2.addWeighted ( image,4, cv2.GaussianBlur( image , (0,0) , sigmaX) ,-4 ,128)
        
    return image
In [13]:
%%time

NUM_SAMP=7
fig = plt.figure(figsize=(25, 16))
for class_id in sorted(train_y.unique()):
    for i, (idx, row) in enumerate(df_train.loc[df_train['diagnosis'] == class_id].sample(NUM_SAMP, random_state=SEED).iterrows()):
        ax = fig.add_subplot(5, NUM_SAMP, class_id * NUM_SAMP + i + 1, xticks=[], yticks=[])
        path=f"/storage/aptos2019-blindness-detection/train_images/{row['id_code']}.png"
        image = load_ben_color(path,sigmaX=30)

        plt.imshow(image)
        ax.set_title('%d-%d-%s' % (class_id, idx, row['id_code']) )
CPU times: user 33.6 s, sys: 184 ms, total: 33.8 s
Wall time: 11.2 s

3.A2 Try the new idea of circle crop

@taindow proposes an interesting idea of making a circle crop to the image, so I update the kernel to let you compare the results. Credit : https://www.kaggle.com/taindow/pre-processing-train-and-test-images ... Observe that we now get a magic circle, but by using circle crop, some scabs/wools may get loss.

In [16]:
def circle_crop(img, sigmaX=10):   
    """
    Create circular crop around image centre    
    """    
    
    img = cv2.imread(img)
    img = crop_image_from_gray(img)    
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    
    height, width, depth = img.shape    
    
    x = int(width/2)
    y = int(height/2)
    r = np.amin((x,y))
    
    circle_img = np.zeros((height, width), np.uint8)
    cv2.circle(circle_img, (x,y), int(r), 1, thickness=-1)
    img = cv2.bitwise_and(img, img, mask=circle_img)
    img = crop_image_from_gray(img)
    img=cv2.addWeighted ( img,4, cv2.GaussianBlur( img , (0,0) , sigmaX) ,-4 ,128)
    return img
In [17]:
%%time
## try circle crop
NUM_SAMP=1
for class_id in [1, 2]:
    for i, (idx, row) in enumerate(df_train.loc[df_train['diagnosis'] == class_id].sample(NUM_SAMP, random_state=SEED).iterrows()):
        ax = fig.add_subplot(5, NUM_SAMP, class_id * NUM_SAMP + i + 1, xticks=[], yticks=[])
        path=f"/storage/aptos2019-blindness-detection/train_images/{row['id_code']}.png"
        image = circle_crop(path,sigmaX=30)
        image = cv2.resize(image, (IMG_SIZE, IMG_SIZE))

        plt.imshow(image)
CPU times: user 8.3 s, sys: 24 ms, total: 8.32 s
Wall time: 1.81 s

We can try plotting a picture (sample train pic(4,1) above) with IMG_SIZE with cropping, now important information is much clearer to see with sigmaX = 10

In [18]:
dpi = 80 #inch

# path=f"../input/aptos2019-blindness-detection/train_images/5c7ab966a3ee.png" # notice upper part
path=f"/storage/aptos2019-blindness-detection/train_images/cd54d022e37d.png" # lower-right, can be class3
image = load_ben_color(path,sigmaX=10)

height, width = IMG_SIZE, IMG_SIZE
print(height, width)

SCALE=1
figsize = (width / float(dpi))/SCALE, (height / float(dpi))/SCALE

fig = plt.figure(figsize=figsize)
plt.imshow(image, cmap='gray')
512 512
Out[18]:
<matplotlib.image.AxesImage at 0x7f91a2655fd0>

Try the method on Public Test Set

We can also try auto cropping on 50 test data to see that it work fine. Below, we see immediately from this random samples that severed cases, with level >2, are relatively many more compared to the training set.

In [19]:
%%time
NUM_SAMP=10
fig = plt.figure(figsize=(25, 16))
for jj in range(5):
    for i, (idx, row) in enumerate(df_test.sample(NUM_SAMP,random_state=SEED+jj).iterrows()):
        ax = fig.add_subplot(5, NUM_SAMP, jj * NUM_SAMP + i + 1, xticks=[], yticks=[])
        path=f"/storage/aptos2019-blindness-detection/test_images/{row['id_code']}.png"
        image = load_ben_color(path,sigmaX=30)
        
        plt.imshow(image)
        ax.set_title('%d-%s' % (idx, row['id_code']) )
CPU times: user 40.2 s, sys: 92 ms, total: 40.2 s
Wall time: 8.13 s
In [20]:
%%time
'''Bonus : sigmaX=50'''
NUM_SAMP=10
fig = plt.figure(figsize=(25, 16))
for jj in range(5):
    for i, (idx, row) in enumerate(df_test.sample(NUM_SAMP,random_state=SEED+jj).iterrows()):
        ax = fig.add_subplot(5, NUM_SAMP, jj * NUM_SAMP + i + 1, xticks=[], yticks=[])
        path=f"/storage/aptos2019-blindness-detection/test_images/{row['id_code']}.png"
        image = load_ben_color(path,sigmaX=50)

        plt.imshow(image, cmap='gray')
        ax.set_title('%d-%s' % (idx, row['id_code']) )
CPU times: user 2min 3s, sys: 100 ms, total: 2min 3s
Wall time: 18.3 s

4. Try the same method to Past competition data

Thanks @tanlikesmath, https://www.kaggle.com/tanlikesmath/diabetic-retinopathy-resized who provides us a complete previous-competition dataset in the .jpeg format; this is much smaller than the original version with the risk of losing image details. Let apply both normal gray scale, and Ben Graham's method to this dataset.

In [21]:
!ls /storage/diabetic-retinopathy-resized/
resized_train  resized_train_cropped  trainLabels.csv  trainLabels_cropped.csv
In [22]:
!ls /storage/diabetic-retinopathy-resized/resized_train/resized_train | head
10003_left.jpeg
10003_right.jpeg
10007_left.jpeg
10007_right.jpeg
10009_left.jpeg
10009_right.jpeg
1000_left.jpeg
1000_right.jpeg
10010_left.jpeg
10010_right.jpeg
ls: write error: Broken pipe
In [23]:
df_old = pd.read_csv('/storage/retinopathy-train-2015/trainLabels.csv')

df_old.head()
Out[23]:
image level
0 10_left 0
1 10_right 0
2 13_left 0
3 13_right 0
4 15_left 1
In [24]:
NUM_SAMP=10
fig = plt.figure(figsize=(25, 16))
for class_id in sorted(train_y.unique()):
    for i, (idx, row) in enumerate(df_old.loc[df_old['level'] == class_id].sample(NUM_SAMP, random_state=SEED).iterrows()):
        ax = fig.add_subplot(5, NUM_SAMP, class_id * NUM_SAMP + i + 1, xticks=[], yticks=[])
        path=f"/storage/retinopathy-train-2015/rescaled_train_896/{row['image']}.png"
        image = load_ben_color(path,sigmaX=10)

        plt.imshow(image)
        ax.set_title('%d-%d-%s' % (class_id, idx, row['image']) )

Below is the unpreprocess version, just for comparison

In [58]:
NUM_SAMP=10
fig = plt.figure(figsize=(25, 16))
for class_id in sorted(train_y.unique()):
    for i, (idx, row) in enumerate(df_old.loc[df_old['level'] == class_id].sample(NUM_SAMP, random_state=SEED).iterrows()):
        ax = fig.add_subplot(5, NUM_SAMP, class_id * NUM_SAMP + i + 1, xticks=[], yticks=[])
        print(row['image'])
        path=f"/storage/diabetic-retinopathy-resized/resized_train/resized_train/{row['image']}.jpeg"
        image = cv2.imread(path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

#         image = crop_image_from_gray(image)
        image = cv2.resize(image, (IMG_SIZE, IMG_SIZE))
#         image=cv2.addWeighted ( image,4, cv2.GaussianBlur( image , (0,0) , IMG_SIZE/10) ,-4 ,128)

        plt.imshow(image, cmap='gray')
        ax.set_title('%d-%d-%s' % (class_id, idx, row['image']) )
15539_left
3580_right
15817_left
40169_left
21856_left
26007_left
34338_right
22497_right
329_left
6938_left
13522_left
6865_left
19462_left
32695_left
4434_left
10030_right
39834_left
7172_right
35711_left
13517_left
32159_left
41173_left
37647_left
34321_left
11288_left
19120_right
3202_left
1541_left
28807_left
33996_right
21251_right
15056_left
13716_left
5823_right
16201_left
4366_right
13395_left
23453_left
34571_right
22930_right
22379_left
31590_right
986_left
43199_right
9419_right
4909_left
15038_left
670_right
18017_left
7164_left

Ok preprocessing methods seem to works fine; however, the doctors to estimate the severity levels in the past competitions may have different criteria in mind than the doctors of Aravind, so it is possible to have some estimation inconsistency (at least to my eyes the previous data seems more noisy). The following level-4 [pic(4,1) in the plot we just made above] looks not so severe. (Or this is the example case of too many blood vessels ??, refer to Section 1.1)

In [59]:
dpi = 80 #inch

path=f"/storage/diabetic-retinopathy-resized/resized_train/resized_train/31590_right.jpeg" # too many vessels?
# path=f"../input/diabetic-retinopathy-resized/resized_train/resized_train/18017_left.jpeg" # details are lost
image = load_ben_color(path,sigmaX=30)
# image = cv2.imread(path)
# image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
# image = crop_image1(image)
# image = cv2.resize(image, (IMG_SIZE, IMG_SIZE))
# image=cv2.addWeighted ( image,4, cv2.GaussianBlur( image , (0,0) , IMG_SIZE/10) ,-4 ,128)

height, width = IMG_SIZE, IMG_SIZE
print(height, width)

SCALE=1
figsize = (width / float(dpi))/SCALE, (height / float(dpi))/SCALE

fig = plt.figure(figsize=figsize)
plt.imshow(image, cmap='gray')
512 512
Out[59]:
<matplotlib.image.AxesImage at 0x7fe2f489c2b0>
In [ ]:

In [57]:
df_train2 = pd.read_csv('/storage/aptos2019-blindness-detection/train.csv')
Out[57]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x7f67c1380a58>]],
      dtype=object)
In [60]:
df_train2.head()
Out[60]:
id_code diagnosis
0 000c1434d8d7 2
1 001639a390f0 4
2 0024cdab0c1e 1
3 002c21358ce6 0
4 005b95c28852 0
In [ ]:
df_train2.hist()
In [66]:
df_train2.groupby('diagnosis').agg(['count'])
Out[66]:
id_code
count
diagnosis
0 1805
1 370
2 999
3 193
4 295
In [68]:
df_2015_1 = pd.read_csv('/storage/diabetic-retinopathy-resized/trainLabels.csv')
df_2015_1.head()
Out[68]:
image level
0 10_left 0
1 10_right 0
2 13_left 0
3 13_right 0
4 15_left 1
In [69]:
df_2015_1.groupby('level').agg(['count'])
Out[69]:
image
count
level
0 25810
1 2443
2 5292
3 873
4 708
In [70]:
df_2015_2 = pd.read_csv('/storage/diabetic-retinopathy-resized/trainLabels_cropped.csv')
df_2015_2.head()
Out[70]:
Unnamed: 0 Unnamed: 0.1 image level
0 0 0 10_left 0
1 1 1 10_right 0
2 2 2 13_left 0
3 3 3 13_right 0
4 4 4 15_left 1
In [71]:
df_2015_2.groupby('level').agg(['count'])
Out[71]:
Unnamed: 0 Unnamed: 0.1 image
count count count
level
0 25802 25802 25802
1 2438 2438 2438
2 5288 5288 5288
3 872 872 872
4 708 708 708
In [72]:
!ls /storage/retinopathy-train-2015
rescaled_train_896  trainLabels.csv
In [73]:
df_2015_3 = pd.read_csv('/storage/retinopathy-train-2015/trainLabels.csv')
df_2015_3.head()
Out[73]:
image level
0 10_left 0
1 10_right 0
2 13_left 0
3 13_right 0
4 15_left 1
In [74]:
df_2015_3.groupby('level').agg(['count'])
Out[74]:
image
count
level
0 25810
1 2443
2 5292
3 873
4 708

Merge and write final dataset

We will merge the original dataset with the retinopathy-train-2015 dataset for our train dataset.

To do this, we'll need to proceed with the following steps:

  1. Transform the id into the full path of the data so we don't have to worry about it anymore. For both datasets.
  2. Preprocess dataset according to https://arxiv.org/pdf/1812.10595.pdf and this notebook.
  3. Write new dataset and csv to disk.

CSV part

In [25]:
root_path = '/storage'
df_train1 = pd.read_csv(f'{root_path}/aptos2019-blindness-detection/train.csv')
df_test = pd.read_csv(f'{root_path}/aptos2019-blindness-detection/test.csv')

df_train2 = pd.read_csv(f'{root_path}/retinopathy-train-2015/trainLabels.csv')
In [26]:
df_train1.head(1)
Out[26]:
id_code diagnosis
0 000c1434d8d7 2
In [27]:
df_train2.head(1)
Out[27]:
image level
0 10_left 0
In [28]:
df_train2 = df_train2.rename(columns={'image': 'id_code', 'level': 'diagnosis'})
df_train2.head(1)
Out[28]:
id_code diagnosis
0 10_left 0
In [29]:
df_train_all = pd.concat([df_train1, df_train2])
In [30]:
df_train_all
Out[30]:
id_code diagnosis
0 000c1434d8d7 2
1 001639a390f0 4
2 0024cdab0c1e 1
3 002c21358ce6 0
4 005b95c28852 0
5 0083ee8054ee 4
6 0097f532ac9f 0
7 00a8624548a9 2
8 00b74780d31d 2
9 00cb6555d108 1
10 00cc2b75cddd 0
11 00e4ddff966a 2
12 00f6c1be5a33 0
13 0104b032c141 3
14 0124dffecf29 1
15 0125fbd2e791 0
16 012a242ac6ff 2
17 014508ccb9cb 0
18 0151781fe50b 0
19 0161338f53cc 2
20 0180bfa26c0b 2
21 0182152c50de 0
22 01b3aed3ed4c 1
23 01c7808d901d 2
24 01d9477b1171 0
25 01eb826f6467 2
26 01f7bb8be950 0
27 0212dd31f623 0
28 022f820027b8 0
29 0231642cf1c2 0
... ... ...
35096 44317_left 0
35097 44317_right 0
35098 44323_left 1
35099 44323_right 1
35100 44325_left 0
35101 44325_right 0
35102 44327_left 0
35103 44327_right 0
35104 44328_left 0
35105 44328_right 1
35106 44330_left 0
35107 44330_right 0
35108 44331_left 0
35109 44331_right 0
35110 44334_left 0
35111 44334_right 0
35112 44337_left 1
35113 44337_right 0
35114 44338_left 0
35115 44338_right 0
35116 44339_left 0
35117 44339_right 0
35118 44343_left 0
35119 44343_right 0
35120 44347_left 0
35121 44347_right 0
35122 44348_left 0
35123 44348_right 0
35124 44349_left 0
35125 44349_right 1

38788 rows × 2 columns

In [31]:
!mkdir -p /storage/aptosplus
!mkdir -p /storage/aptosplus/aptos
!mkdir -p /storage/aptosplus/eyepacs
In [32]:
eyepacs = re.compile(r'(left|right)')
def set_ds(c):
    id_code = c['id_code']
    return 'eyepacs' if eyepacs.search(id_code) else 'aptos'

df_train_all['ds'] = df_train_all.apply(set_ds, axis=1)
df_train_all.head()
Out[32]:
id_code diagnosis ds
0 000c1434d8d7 2 aptos
1 001639a390f0 4 aptos
2 0024cdab0c1e 1 aptos
3 002c21358ce6 0 aptos
4 005b95c28852 0 aptos
In [33]:
df_train_all.to_csv('/storage/aptosplus/trainLabels.csv', index=False)
In [34]:
!head -10 /storage/aptosplus/trainLabels.csv
id_code,diagnosis,ds
000c1434d8d7,2,aptos
001639a390f0,4,aptos
0024cdab0c1e,1,aptos
002c21358ce6,0,aptos
005b95c28852,0,aptos
0083ee8054ee,4,aptos
0097f532ac9f,0,aptos
00a8624548a9,2,aptos
00b74780d31d,2,aptos

Images part

In [44]:
def write_images_to_disk_multi(packed_args):
    i, row = packed_args
    if row['ds'] == 'aptos':
        image = load_ben_color(f'{root_path}/aptos2019-blindness-detection/train_images/{row["id_code"]}.png')
        cv2.imwrite(f'{root_path}/aptosplus/aptos/{row["id_code"]}.png', image)                      
    else:
        image = load_ben_color(f'{root_path}/retinopathy-train-2015/rescaled_train_896/{row["id_code"]}.png')
        cv2.imwrite(f'{root_path}/aptosplus/eyepacs/{row["id_code"]}.png', image)
In [47]:
%%time
pool = Pool()                   
pool.map(write_images_to_disk_multi, df_train_all.iterrows())
CPU times: user 10.8 s, sys: 2.3 s, total: 13.1 s
Wall time: 14min 16s
Out[47]:
[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 ...]